LECTURE 1

1 Introduction

Here we begin writing the introduction to our material.

Here we continue.

Next we add other typographic elements.

1.1 Objectives

1.1.1 General Objective

1.1.2 Specific Objective

1.1.2.1 Subsubsection

1.1.2.1.1 Paragraph

1.1.2.1.1.1 Subparagraph

1.1.2.1.1.1.1 Subsubparagraph

A subsubparagraph is accepted by Markdown but is not typographically defined. Normally we only go down to level 6, which is the subparagraph.

2 Theoretical-empirical framework

3 Methodology

Writing an equation

$$
y = ax^2+bx+c
$$


4 Data analysis and discussion of results

```{r,}
2 + 2
## [1] 4
```
```{r,}
set.seed(1)
.x <- rnorm(10000)
head(.x, 10)
##  [1] -0.6264538  0.1836433 -0.8356286  1.5952808  0.3295078 -0.8204684
##  [7]  0.4874291  0.7383247  0.5757814 -0.3053884
tail(.x, 10)
##  [1]  1.04776175 -0.02428861 -0.47787499 -0.02971747  0.20966546  0.95950757
##  [7]  0.43660362  0.49936656  0.89397983  0.25738706
```
```{r,}
hist(.x, freq = FALSE, col = "yellow", border = "black")

hist(.x, freq = FALSE, col = "lightyellow", border = "gray")
curve(dnorm(x, mean(.x), sd(.x)), xlim = c(min(.x), max(.x)),
    add = TRUE, col = "#4040FFC0", lwd = 2.5)

```
```{r,}
1/2
## [1] 0.5
```

5 Data from Principles of Econometrics

5.1 The PoEdata_0.1.0.tar.gz package

Once the PoEdata_0.1.0.tar.gz package is installed:

```{r,}
library(PoEdata)
```
```{r,}
library(printr)
## Registered S3 method overwritten by 'printr':
##   method                from     
##   knit_print.data.frame rmarkdown
data(mroz)
head(mroz[, 1:5])
taxableinc federaltax hsiblings hfathereduc hmothereduc
     12200       1494         1          14          16
     18000       2615         8           7           3
     24000       3957         4           7          10
     16400       2279         6           7          12
     10000       1063         3           7           7
      6295        370         8           7           7
tail(mroz[, 1:5])
    taxableinc federaltax hsiblings hfathereduc hmothereduc
748      16100       1825         0           7          12
749      32000       4701         8          12           7
750      18500       2720         4          12          12
751      13000       1642         6           7          12
752      17200       2447         2           7          10
753      18700       2327         4          10           7
```

5.2 A small R programming example

```{r,warning=FALSE}
library(DescTools)
plotSquare = function(deltay = 1.5, deltax = abs(deltay/Asp()),
    xbase = 12, ybase = 3, col = "#FF808040") {
    polygon(c(0, deltax, deltax, 0, 0) + xbase, c(0, 0, deltay,
        deltay, 0) + ybase, col = col)
}
plotRegressao = function(modelo1 = modelo1, horas = horas, nota = nota,
    sequencia = 5, sub = NULL) {
    # draw the observed points
    plot(horas, nota, pch = 20, col = "darkgray", xlim = c(0,
        14), ylim = c(0, 100), axes = FALSE, sub = "Simple linear regression",
        xlab = "Hours of study", ylab = "Exam score",
        main = sub)
    # draw the axes at (0, 0)
    axis(1, pos = 0)
    axis(2, pos = 0)
    # draw the modelo1 line in blue
    if (sequencia > 1)
        abline(modelo1, col = "blue")
    # compute the fitted points on the line
    estimados <- predict(modelo1, horas = horas)
    # draw the fitted points on the line
    if (sequencia > 2)
        points(horas, estimados, pch = 20, col = "blue")
    # draw the error bars (y - y-hat) and label the points
    delta = 0.3
    if (sequencia > 3) {
        for (i in 1:length(horas)) {
            # draw the vertical error bars
            lines(c(horas[i], horas[i]), c(estimados[i], nota[i]),
                col = "red")
            # draw the horizontal caps
            lines(c(horas[i] - delta, horas[i] + delta), c(estimados[i],
                estimados[i]), col = "red")
            lines(c(horas[i] - delta, horas[i] + delta), c(nota[i],
                nota[i]), col = "red")
            # place the point label above or below it,
            # whichever looks better
            text(horas[i], nota[i], bquote(y[.(i)]), pos = ifelse(estimados[i] >
                nota[i], 1, 3))
        }
    }
    if (sequencia > 4) {
        for (i in 1:length(horas)) {
            deltay = nota[i] - estimados[i]
            deltax = abs(deltay/Asp())
            if (deltay > 0)
                deltax = -deltax
            plotSquare(xbase = horas[i], ybase = estimados[i],
                deltay = deltay, deltax = deltax)
        }
    }
    text(8, 10, expression(min(Sigma(y[i] - hat(y)[i])^2)), cex = 1.2)
    text(8, 15, "Ordinary least squares minimizes the sum of squared errors",
        cex = 0.7)
}
```
```{r,fig.asp=1, fig.cap="Exam score *versus* hours of study", fig.align='center'}
# data sets
nota <- c(40, 30, 60, 65, 70, 90)
horas <- c(2, 4, 6, 8, 10, 12)
# lm (linear model) function -> store the fit in modelo1
lm(nota ~ horas) -> modelo1
plotRegressao(modelo1, horas, nota, 1, expression("Point cloud"))

plotRegressao(modelo1, horas, nota, 2, expression(paste("Regression line: ",
    nota == b[0] + b[1] * horas)))

plotRegressao(modelo1, horas, nota, 3, expression(paste("Regression line with the estimates ",
    hat(y))))

plotRegressao(modelo1, horas, nota, 4, expression(paste("Observed errors: ",
    y - hat(y))))

plotRegressao(modelo1, horas, nota, 5, expression(paste("Squared errors: ",
    (y - hat(y))^2)))
```
```{r,}
modelo1
## 
## Call:
## lm(formula = nota ~ horas)
## 
## Coefficients:
## (Intercept)        horas  
##      21.667        5.357
modelo1$coefficients[1]
## (Intercept) 
##    21.66667
modelo1$coefficients[2]
##    horas 
## 5.357143
```
$$
\widehat{\text{nota}} = b_0+b_1\cdot \text{horas} = 21.6666667+5.3571429\cdot \text{horas}
$$


Model diagnostics

Statistics = hypothesis tests

A hypothesis can be rejected or not.

Each test produces a computed value called the p-value, which is compared with a critical value, normally taken as 0.05 (or 5%), called the significance level of the test.

Every test has a null hypothesis (\(H_0\)): if the p-value is smaller than the threshold, \(H_0\) is rejected; if it is equal or greater, \(H_0\) is not rejected (strictly, we "fail to reject" rather than accept it).

\(H_0\) of the F test: there is no relationship between the variables.

\(H_0\) of the t test: the associated coefficient is equal to zero.
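
Both p-values can be read directly off a fitted model. A minimal sketch, using the modelo1 object defined earlier in this section:

```{r,}
smod <- summary(modelo1)
# p-value of the t test for each coefficient (column "Pr(>|t|)")
coef(smod)[, "Pr(>|t|)"]
# p-value of the overall F test, recomputed from the F statistic
fs <- smod$fstatistic
pf(fs["value"], fs["numdf"], fs["dendf"], lower.tail = FALSE)
```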

```{r,}
summary(nota)
 Min. 1st Qu. Median     Mean 3rd Qu. Max.
   30      45   62.5 59.16667   68.75   90
summary(horas)
 Min. 1st Qu. Median Mean 3rd Qu. Max.
    2     4.5      7    7     9.5   12
summary(.x)
    Min.    1st Qu.     Median      Mean   3rd Qu.     Max.
 -3.6713 -0.6733944 -0.0159288 -0.006537 0.6776605 3.810277
summary(modelo1)
## 
## Call:
## lm(formula = nota ~ horas)
## 
## Residuals:
##        1        2        3        4        5        6 
##   7.6190 -13.0952   6.1905   0.4762  -5.2381   4.0476 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   21.667      8.221   2.636   0.0578 . 
## horas          5.357      1.055   5.076   0.0071 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.83 on 4 degrees of freedom
## Multiple R-squared:  0.8656, Adjusted R-squared:  0.832 
## F-statistic: 25.76 on 1 and 4 DF,  p-value: 0.007102
```

\(R^2\) is the share of the variation in the score that is explained by the hours of study.

That is, roughly 83% of the variation in the score (using the adjusted \(R^2\)) is explained by the hours of study.
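
As a check, \(R^2\) can be recomputed by hand as \(1 - SSE/SST\); a small sketch using the nota and modelo1 objects defined above:

```{r,}
sse <- sum(residuals(modelo1)^2)   # sum of squared errors
sst <- sum((nota - mean(nota))^2)  # total sum of squares
1 - sse/sst                        # matches summary(modelo1)$r.squared
```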

Working with graphics and expressions

RStudio is quite capable of handling graphical elements and equations, which makes it well suited to academic writing of this kind.

```{r,}
x = 1:6 * 10
y = 1:6
plot(x, y)
plotSquare(xbase = 20, ybase = 2, deltay = 0.5)
plotSquare(xbase = 30, ybase = 3, deltay = 1.5, col = "#8080FF40")
plotSquare(xbase = 40, ybase = 4, deltay = -2.5, col = "#80FF8040")
plotSquare(xbase = 50, ybase = 5, deltay = -1.5, col = "#FFFF8040")
abline(c(0, 0.1), col = "darkgray")

```
```{r,}
f = function(x) (x^3 + 8)/(x^4 - 16)
x = seq(-3, -1.0000001, length.out = 100)
y = f(x)
plot(x, y, type = "l")
abline(v = -2, h = -3/8, col = "lightblue")
text(-2.75, -0.4, expression(y == lim(frac(x^3 + 8, x^4 - 16),
    x %->% -2)))
text(-1.75, -0.35, expression(y(-2) == "?"))

f(-2.000000001)
## [1] -0.375
-3/8
## [1] -0.375
```

Those who studied Calculus may remember L'Hôpital's rule:

\[ \lim_{x\rightarrow-2}\dfrac{x^3+8}{x^4-16}=\lim_{x\rightarrow-2}\dfrac{\rm d(x^3+8)/\rm dx}{\rm d(x^4-16)/\rm dx}= \lim_{x\rightarrow-2}\dfrac{3x^2}{4x^3} = \dfrac{3(-2)^2}{4(-2)^3} = \dfrac{12}{-32} = -\dfrac{3}{8}=-0.375 \]

One more example:

```{r,}
f = function(x) (exp(x) - 1)/(x)
x = seq(-3, 1, length.out = 100)
y = f(x)
plot(x, y, type = "l")
abline(v = 0, h = 1, col = "lightblue")
text(-2.5, 1.5, expression(y == lim(frac(e^x - 1, x), x %->%
    0)))
text(-2.5, 1.2, expression(y(0) == "?"))

```
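
Applying L'Hôpital's rule to this limit confirms what the plot suggests:

\[ \lim_{x\rightarrow 0}\dfrac{e^x-1}{x}=\lim_{x\rightarrow 0}\dfrac{\rm d(e^x-1)/\rm dx}{\rm d(x)/\rm dx}=\lim_{x\rightarrow 0}\dfrac{e^x}{1}=e^0=1 \]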
```{r,}
plot.new()
plot.window(c(0, 4), c(15, 1))
text(1, 1, "universal", adj = 0)
text(2.5, 1, "\\042")
text(3, 1, expression(symbol("\"")))
text(1, 2, "existential", adj = 0)
text(2.5, 2, "\\044")
text(3, 2, expression(symbol("$")))
text(1, 3, "suchthat", adj = 0)
text(2.5, 3, "\\047")
text(3, 3, expression(symbol("'")))
text(1, 4, "therefore", adj = 0)
text(2.5, 4, "\\134")
text(3, 4, expression(symbol("\\")))
text(1, 5, "perpendicular", adj = 0)
text(2.5, 5, "\\136")
text(3, 5, expression(symbol("^")))
text(1, 6, "circlemultiply", adj = 0)
text(2.5, 6, "\\304")
text(3, 6, expression(symbol("\xc4")))
text(1, 7, "circleplus", adj = 0)
text(2.5, 7, "\\305")
text(3, 7, expression(symbol("\xc5")))
text(1, 8, "emptyset", adj = 0)
text(2.5, 8, "\\306")
text(3, 8, expression(symbol("\xc6")))
text(1, 9, "angle", adj = 0)
text(2.5, 9, "\\320")
text(3, 9, expression(symbol("\xd0")))
text(1, 10, "leftangle", adj = 0)
text(2.5, 10, "\\341")
text(3, 10, expression(symbol("\xe1")))
text(1, 11, "rightangle", adj = 0)
text(2.5, 11, "\\361")
text(3, 11, expression(symbol("\xf1")))

```

LECTURE 2

```{r,}
library(magrittr)
```

6 Simple linear regression

6.1 General model

```{r,}
library(PoEdata)
data("cps_small")
plot(cps_small$educ, cps_small$wage, xlab = "education", ylab = "wage")

plot(cps_small$educ, cps_small$wage, xlab = "Education", ylab = "Income")

```
```{r,}
head(cps_small, 15)
 wage educ exper female black white midwest south west
 2.03   13     2      1     0     1       0     1    0
 2.07   12     7      0     0     1       1     0    0
 2.12   12    35      0     0     1       0     1    0
 2.54   16    20      1     0     1       0     1    0
 2.68   12    24      1     0     1       0     1    0
 3.09   13     4      0     0     1       0     1    0
 3.16   13     1      0     0     1       0     0    1
 3.17   12    22      1     0     1       0     1    0
 3.20   12    23      0     0     1       0     1    0
 3.27   12     4      1     0     1       0     0    1
 3.32   12    11      1     0     1       0     0    1
 3.32   13     3      1     0     1       1     0    0
 3.34   18    15      0     0     1       1     0    0
 3.39   13     7      1     0     1       0     0    0
 3.39   12    15      1     0     1       0     0    1
```
```{r,}
plot(cps_small$exper, cps_small$wage, col = "gray", pch = 20)

```

6.2 Example: Food Expenditure versus Income

```{r,}
library(PoEdata)
data(food)
head(food)
 food_exp income
   115.22   3.69
   135.98   4.39
   119.34   4.75
   114.96   6.03
   187.05  12.47
   243.92  12.98
# help(food)
```
```{r,}
data("food", package = "PoEdata")
plot(food$income, food$food_exp)

# Scatter plot
plot(food$income, food$food_exp, ylim = c(0, max(food$food_exp)),
    xlim = c(0, max(food$income)), xlab = "weekly income in $100",
    ylab = "weekly food expenditure in $", type = "p")

```

6.3 Estimating a Linear Regression

\[ food\_exp = \beta_0 + \beta_1 income +e\\[1em] \widehat{food\_exp} = b_0 + b_1 income \]

```{r,}
library(PoEdata)
# run the regression
mod1 <- lm(formula = food_exp ~ income, data = food)
# look at the coefficients
mod1$coefficients
## (Intercept)      income 
##    83.41600    10.20964
# or
coef(mod1)
## (Intercept)      income 
##    83.41600    10.20964
# one by one
mod1$coefficients[1]
## (Intercept) 
##      83.416
mod1$coefficients[2]
##   income 
## 10.20964
# or
(b1 <- coef(mod1)[[1]])
## [1] 83.416
(b2 <- coef(mod1)[[2]])
## [1] 10.20964
# show the regression results
smod1 <- summary(mod1)
smod1
## 
## Call:
## lm(formula = food_exp ~ income, data = food)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -223.025  -50.816   -6.324   67.879  212.044 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   83.416     43.410   1.922   0.0622 .  
## income        10.210      2.093   4.877 1.95e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 89.52 on 38 degrees of freedom
## Multiple R-squared:  0.385,  Adjusted R-squared:  0.3688 
## F-statistic: 23.79 on 1 and 38 DF,  p-value: 1.946e-05
```
```{r,}
plot(food$income, food$food_exp, xlab = "Weekly income in $100",
    ylab = "Food expenditure in $", ylim = c(0, max(food$food_exp)),
    xlim = c(0, max(food$income)), type = "p", col = "lightblue",
    pch = 16, frame.plot = FALSE, axes = FALSE, main = "Food expenditure versus income")
axis(1, pos = 0)
axis(2, pos = 0)
# abline(b1,b2)
abline(mod1, col = "blue")
abline(h = b1, col = "darkgray", lty = 2)

```

6.4 Prediction with the Linear Regression Model

What is the food expenditure of an individual earning $2,000 per week? (Income is measured in hundreds of dollars, so income = 20.)

\[ \widehat{food\_exp} = 83.416 + 10.210 \cdot income\\ \widehat{food\_exp} = 83.416 + 10.210 \cdot 20 \approx 287.61 \]

```{r,}
coef(mod1)[1] + coef(mod1)[2] * 20
## (Intercept) 
##    287.6089
predict(mod1, data.frame(income = 20))
##        1 
## 287.6089
```

6.5 Repeated Samples to Assess Regression Coefficients

Technically this is called the bootstrap, and we will cover it later.

6.6 Estimated Variances and Covariance of Regression Coefficients

This will be useful later.

6.7 Non-Linear Relationships

```{r,}
library(PoEdata)
data(br)
# testing a quadratic relationship
mod3 <- lm(formula = price ~ I(sqft^2), data = br)
# versus a first-degree one
mod3.a <- lm(formula = price ~ sqft, data = br)
(summary(mod3) -> s3)
## 
## Call:
## lm(formula = price ~ I(sqft^2), data = br)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -696604  -23366     779   21869  713159 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.578e+04  2.890e+03   19.30   <2e-16 ***
## I(sqft^2)   1.542e-02  3.131e-04   49.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68210 on 1078 degrees of freedom
## Multiple R-squared:  0.6923, Adjusted R-squared:  0.6921 
## F-statistic:  2426 on 1 and 1078 DF,  p-value: < 2.2e-16
(summary(mod3.a) -> s3a)
## 
## Call:
## lm(formula = price ~ sqft, data = br)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -366641  -31399   -1535   25601  932272 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -60861.462   6110.187  -9.961   <2e-16 ***
## sqft            92.747      2.411  38.476   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79820 on 1078 degrees of freedom
## Multiple R-squared:  0.5786, Adjusted R-squared:  0.5783 
## F-statistic:  1480 on 1 and 1078 DF,  p-value: < 2.2e-16
# plotting the data with the regression curves
plot(br$sqft, br$price, pch = 20, col = "gray")
x = seq(min(br$sqft), max(br$sqft), length.out = 100)
y = predict(mod3, data.frame(sqft = x))
abline(mod3.a, col = "red", lwd = 2)
lines(x, y, col = "blue", lwd = 2)
legend("topleft", legend = c(bquote(paste("First degree: ",
    R[aj]^2 == .(s3a$adj.r.squared %>%
        round(4)))), bquote(paste("Second degree: ", R[aj]^2 ==
    .(s3$adj.r.squared %>%
        round(4))))), cex = 0.8, fill = c("red", "blue"))
grid()

b1 <- coef(mod3)[[1]]
b2 <- coef(mod3)[[2]]
sqftx = c(2000, 4000, 6000)  #given values for sqft
pricex = b1 + b2 * sqftx^2  #prices corresponding to given sqft 
DpriceDsqft <- 2 * b2 * sqftx  # marginal effect of sqft on price
elasticity = DpriceDsqft * sqftx/pricex
b1
## [1] 55776.57
b2
## [1] 0.0154213
DpriceDsqft
## [1]  61.68521 123.37041 185.05562
elasticity  #prints results
## [1] 1.050303 1.631251 1.817408
```

6.7.1 Checking the dependent variable

```{r,}
hist(br$price)

```

Variable transformation

```{r,}
hist(log(br$price))

```

6.7.2 Log transformation

```{r,}
plot(br$sqft, log(br$price))

```

6.8 Using Indicator Variables in a Regression

Indicator variable = dummy

\[ dummy \in \{0, 1\} \]

utown = university town

```{r,}
data(utown)
head(utown)
   price  sqft age utown pool fplace
 205.452 23.46   6     0    0      1
 185.328 20.03   5     0    0      1
 248.422 27.77   6     0    0      0
 154.690 20.17   1     0    0      0
 221.801 26.45   0     0    0      1
 199.119 21.56   6     0    0      1
```
```{r,}
mod5 = lm(price ~ utown, data = utown)
summary(mod5)
## 
## Call:
## lm(formula = price ~ utown, data = utown)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -85.672 -20.359  -0.462  20.646  67.955 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  215.732      1.318  163.67   <2e-16 ***
## utown         61.509      1.830   33.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.91 on 998 degrees of freedom
## Multiple R-squared:  0.5311, Adjusted R-squared:  0.5306 
## F-statistic:  1130 on 1 and 998 DF,  p-value: < 2.2e-16
```

Outside utown, price = 215.732 (215.7324948)

Inside utown, price = 215.7324948 + 61.5091064 = 277.2416012

```{r,}
mean(utown[utown$utown == 1, "price"])
## [1] 277.2416
mean(utown[utown$utown == 0, "price"])
## [1] 215.7325
```
```{r,}
library(magrittr)
utown[utown$utown == 1, "price"] %>%
    mean
## [1] 277.2416
utown[utown$utown == 0, "price"] %>%
    mean
## [1] 215.7325
```

6.9 Monte Carlo

We will see this later.

7 Chapter 3 Interval Estimation and Hypothesis Testing

7.1 Example: Confidence Intervals in the food Model

```{r,}
library(PoEdata)
data("food")
alpha <- 0.05  # chosen significance level
mod1 <- lm(food_exp ~ income, data = food)
b2 <- coef(mod1)[[2]]
df <- df.residual(mod1)  # degrees of freedom
smod1 <- summary(mod1)
seb2 <- coef(smod1)[2, 2]  # se(b2)
tc <- qt(1 - alpha/2, df)
lowb <- b2 - tc * seb2  # lower bound
upb <- b2 + tc * seb2  # upper bound
c(lowb, b2, upb)
## [1]  5.972052 10.209643 14.447233
```

I have 1 - significance = 1 - 0.05 = 0.95 = 95% CONFIDENCE that the slope coefficient lies between 5.9720525 and 14.4472334.

```{r,}
confint(mod1, level = 0.95)
                2.5 %    97.5 %
(Intercept) -4.463279 171.29528
income       5.972053  14.44723
```

7.2 Bootstrap

Resampling

```{r,}
# Fix the seed of the pseudo-random number generator
set.seed(1)
# Number of simulations
N = 500  # set to 500 to run faster; in practice 2000 or more is used
# Number of elements in the sample
nrow(br) -> n
# Initialize the vector of values
valores = NULL
# resampling loop
for (i in 1:N) {
    # draw a sample of size n with replacement
    sample(1:n, n, replace = TRUE) -> idx
    # run the regression
    lm(price ~ I(sqft^2), data = br[idx, ]) -> modb
    # store the value of the slope coefficient
    valores = c(valores, modb$coefficients[2])
}
```
```{r,}
# draw a histogram with a superimposed normal curve. This color
# scheme is just an example; adopt one standard for all your
# plots so they don't turn into a 'carnival'
hist(valores, freq = FALSE, col = "#FFA00080", border = "white")
curve(dnorm(x, mean(valores), sd(valores)), xlim = c(min(valores),
    max(valores)), add = TRUE, col = "darkgreen", lwd = 2)

c(mod3$coefficients[2], mean(valores))
##  I(sqft^2)            
## 0.01542130 0.01535264
# Normality test (we will see it in the near future)
shapiro.test(sample(valores, min(500, length(valores))))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(valores, min(500, length(valores)))
## W = 0.99399, p-value = 0.04537
```

8 Addendum - reading data from Excel

```{r,}
# file.choose()
library(openxlsx)
read.xlsx("/Users/jfrega/Downloads/DadosTeste.xlsx", sheet = "Planilha1",
    startRow = 1) -> meusDados
plot(meusDados$x, meusDados$y)

lm(y ~ x, meusDados) -> m
m %>%
    summary
## 
## Call:
## lm(formula = y ~ x, data = meusDados)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9643 -1.2054  0.2679  1.1696  1.5000 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.6429     1.1198   5.932  0.00102 ** 
## x             2.1071     0.2218   9.502 7.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.437 on 6 degrees of freedom
## Multiple R-squared:  0.9377, Adjusted R-squared:  0.9273 
## F-statistic: 90.29 on 1 and 6 DF,  p-value: 7.745e-05
```

9 Addendum - OLS regression in Python


```{r,engine="python"}
#
# ATTENTION: to run this chunk of code you need Python installed and configured
#
# Python's import command is similar to R's library command
# statsmodels.formula.api is the interface to the statistical models
import statsmodels.formula.api as smf
# access the data that was read in my R code
r.meusDados
##      x     y
## 0  1.0  10.0
## 1  2.0  12.0
## 2  3.0  11.0
## 3  4.0  15.0
## 4  5.0  16.0
## 5  6.0  18.0
## 6  7.0  22.0
## 7  8.0  25.0
# run the OLS model
model = smf.ols(formula="y~x", data=r.meusDados)
# inspect the results
print(model.fit().summary())
##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                      y   R-squared:                       0.938
## Model:                            OLS   Adj. R-squared:                  0.927
## Method:                 Least Squares   F-statistic:                     90.29
## Date:                Wed, 02 Apr 2025   Prob (F-statistic):           7.75e-05
## Time:                        09:54:00   Log-Likelihood:                -13.102
## No. Observations:                   8   AIC:                             30.20
## Df Residuals:                       6   BIC:                             30.36
## Df Model:                           1                                         
## Covariance Type:            nonrobust                                         
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## Intercept      6.6429      1.120      5.932      0.001       3.903       9.383
## x              2.1071      0.222      9.502      0.000       1.565       2.650
## ==============================================================================
## Omnibus:                        2.183   Durbin-Watson:                   1.522
## Prob(Omnibus):                  0.336   Jarque-Bera (JB):                0.848
## Skew:                          -0.279   Prob(JB):                        0.654
## Kurtosis:                       1.505   Cond. No.                         11.5
## ==============================================================================
## 
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## 
## /Users/jfrega/Library/r-miniconda-arm64/lib/python3.10/site-packages/scipy/stats/_axis_nan_policy.py:418: UserWarning: `kurtosistest` p-value may be inaccurate with fewer than 20 observations; only n=8 observations were given.
##   return hypotest_fun_in(*args, **kwds)
```


LECTURE 3

```{r,}
library(magrittr)
```

9.1 Forecasting (Predicting a Particular Value)

```{r,}
library(PoEdata)
data("food")
plot(x = food$income, y = food$food_exp, xlim = c(0, max(food$income)),
    ylim = c(0, max(food$food_exp)), axes = FALSE)
axis(1, pos = 0)
axis(2, pos = 0)
alpha <- 0.05
x <- 20
xbar <- mean(food$income)
ybar <- mean(food$food_exp)
m1 <- lm(formula = food_exp ~ income, data = food)
abline(m1)
points(xbar, ybar, col = "red", pch = 16, cex = 2)
abline(h = ybar, v = xbar, col = "red", lty = 2)
predict(m1, data.frame(income = 20), interval = "confidence",
    level = 0.95)
      fit      lwr      upr
 287.6089 258.9069 316.3108
predict(m1, data.frame(income = 20), interval = "prediction",
    level = 0.95)
      fit      lwr      upr
 287.6089 104.1323 471.0854
summary(m1) -> sm1
sm1
## 
## Call:
## lm(formula = food_exp ~ income, data = food)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -223.025  -50.816   -6.324   67.879  212.044 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   83.416     43.410   1.922   0.0622 .  
## income        10.210      2.093   4.877 1.95e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 89.52 on 38 degrees of freedom
## Multiple R-squared:  0.385,  Adjusted R-squared:  0.3688 
## F-statistic: 23.79 on 1 and 38 DF,  p-value: 1.946e-05
xx = seq(min(food$income), max(food$income), length.out = 50)
yy = predict(m1, data.frame(income = xx), interval = "prediction",
    level = 1 - 0.05)
lines(xx, yy[, 2], col = "blue", lty = 3)
lines(xx, yy[, 3], col = "blue", lty = 3)

```

\[ \text{food\_exp}= b_0+b_1\ \text{income}+e\\ \widehat{\text{food\_exp}}= 83.416+10.210\ \text{income} \]

9.2 Goodness-of-Fit

Goodness of fit: how well the model fits the data.

```{r,}
sm1
## 
## Call:
## lm(formula = food_exp ~ income, data = food)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -223.025  -50.816   -6.324   67.879  212.044 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   83.416     43.410   1.922   0.0622 .  
## income        10.210      2.093   4.877 1.95e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 89.52 on 38 degrees of freedom
## Multiple R-squared:  0.385,  Adjusted R-squared:  0.3688 
## F-statistic: 23.79 on 1 and 38 DF,  p-value: 1.946e-05
sm1$r.squared
## [1] 0.3850022
sm1$adj.r.squared
## [1] 0.3688181
sm1$fstatistic
##    value    numdf    dendf 
## 23.78884  1.00000 38.00000
```

9.3 Linear-Log Models

```{r,}
mod2 <- lm(food_exp ~ log(income), data = food)
summary(mod2)
## 
## Call:
## lm(formula = food_exp ~ log(income), data = food)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -215.427  -51.666    2.186   47.819  241.548 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -97.19      84.24  -1.154    0.256    
## log(income)   132.17      28.80   4.588 4.76e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 91.57 on 38 degrees of freedom
## Multiple R-squared:  0.3565, Adjusted R-squared:  0.3396 
## F-statistic: 21.05 on 1 and 38 DF,  p-value: 4.76e-05
```

9.4 Residuals and Diagnostics

9.4.1 Normality of residuals

```{r,}
# standardized residuals have mean zero and standard deviation one
m1$residuals %>%
    scale -> padres
padres %>%
    hist(freq = FALSE)
curve(dnorm(x, 0, 1), xlim = c(-3, 3), add = TRUE)

```
```{r,}
# Shapiro-Wilk test H0: no deviations from normality
# (p > 0.05)
padres %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.98838, p-value = 0.9493
```
```{r,}
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(magrittr)
# Jarque-Bera test H0: no deviations from normality (p
# > 0.05)
padres %>%
    jarque.bera.test()
## 
##  Jarque Bera Test
## 
## data:  .
## X-squared = 0.06334, df = 2, p-value = 0.9688
```

Normality tests work well for n > 30 and n < 400 (empirical, approximate values).
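
A simulated sketch of why very large samples are a problem (the object names here are illustrative): with thousands of observations, even a small contamination of an otherwise normal sample yields a tiny p-value.

```{r,}
set.seed(123)
n <- 4000
# 95% normal data plus 5% mildly heavy-tailed contamination
z <- c(rnorm(0.95 * n), rt(0.05 * n, df = 5))
shapiro.test(z)  # tends to reject normality despite near-normal data
```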

```{r,}
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
## 
##     Recode
padres %>%
    qqPlot()

## [1] 31 38
```
```{r,}
set.seed(1)
rexp(20) %>%
    qqPlot

## [1] 14  6
runif(20) %>%
    qqPlot

## [1] 10 18
```

9.5 Functional form test

Ramsey RESET test

```{r,}
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Ramsey RESET test H0: the functional form is adequate
reset(m1)
## 
##  RESET test
## 
## data:  m1
## RESET = 0.066009, df1 = 2, df2 = 36, p-value = 0.9362
```

An adequate functional form means the equation is functionally well specified, that is, the terms enter with the right powers and functions.

```{r,}
set.seed(12)
xx = seq(-3, 3, length.out = 50)
yy = 3 * xx^2 + 2 * rnorm(xx)
plot(xx, yy)

```
```{r,}
mteste = lm(yy ~ xx)
summary(mteste)
## 
## Call:
## lm(formula = yy ~ xx)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.114  -6.409  -3.041   5.847  19.216 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.0815     1.1687   7.770  4.9e-10 ***
## xx            0.1050     0.6614   0.159    0.874    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.264 on 48 degrees of freedom
## Multiple R-squared:  0.0005252,  Adjusted R-squared:  -0.0203 
## F-statistic: 0.02522 on 1 and 48 DF,  p-value: 0.8745
```
```{r,}
reset(mteste)
## 
##  RESET test
## 
## data:  mteste
## RESET = 537.61, df1 = 2, df2 = 46, p-value < 2.2e-16
```

The RESET test rejected the functional form that was used.

```{r,}
mteste2 = lm(yy ~ I(xx^2))
reset(mteste2)
## 
##  RESET test
## 
## data:  mteste2
## RESET = 1.2401, df1 = 2, df2 = 46, p-value = 0.2988
```

There we go: now the functional form is adequate, and it is quadratic!

```{r,}
set.seed(12)
xx = seq(-3, 3, length.out = 50)
yy = 3 * xx^3 + 4 * rnorm(xx)
plot(xx, yy)

```
```{r,}
mteste3 = lm(yy ~ I(xx^3))
reset(mteste3, power = 3)
## 
##  RESET test
## 
## data:  mteste3
## RESET = 0.0016132, df1 = 1, df2 = 47, p-value = 0.9681
summary(mteste3)
## 
## Call:
## lm(formula = yy ~ I(xx^3))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7571 -2.3349 -0.2141  1.9016  8.8388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.57172    0.49085  -1.165     0.25    
## I(xx^3)      3.04184    0.04533  67.099   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.471 on 48 degrees of freedom
## Multiple R-squared:  0.9895, Adjusted R-squared:  0.9892 
## F-statistic:  4502 on 1 and 48 DF,  p-value: < 2.2e-16
```
```{r,}
plot(xx, yy)
abline((lm(yy ~ xx) -> mteste1), col = "green")
mteste2 = lm(yy ~ I(xx^2))
lines(xx, predict(mteste2, data.frame(xx = xx)), col = "red")
mteste3 = lm(yy ~ I(xx^3))
lines(xx, predict(mteste3, data.frame(xx = xx)), col = "blue")
legend("topleft", legend = c("Degree 1", "Degree 2", "Degree 3"), fill = c("green",
    "red", "blue"), cex = 0.7)

```
```{r,}
mteste1 %>%
    reset
## 
##  RESET test
## 
## data:  .
## RESET = 377.37, df1 = 2, df2 = 46, p-value < 2.2e-16
mteste1$residuals %>%
    scale %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.96821, p-value = 0.1956
mteste2 %>%
    reset
## 
##  RESET test
## 
## data:  .
## RESET = 0.011796, df1 = 2, df2 = 46, p-value = 0.9883
mteste2$residuals %>%
    scale %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.94613, p-value = 0.02371
mteste3 %>%
    reset
## 
##  RESET test
## 
## data:  .
## RESET = 0.46335, df1 = 2, df2 = 46, p-value = 0.6321
mteste3$residuals %>%
    scale %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.96619, p-value = 0.1613
summary(mteste3)
## 
## Call:
## lm(formula = yy ~ I(xx^3))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7571 -2.3349 -0.2141  1.9016  8.8388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.57172    0.49085  -1.165     0.25    
## I(xx^3)      3.04184    0.04533  67.099   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.471 on 48 degrees of freedom
## Multiple R-squared:  0.9895, Adjusted R-squared:  0.9892 
## F-statistic:  4502 on 1 and 48 DF,  p-value: < 2.2e-16
```

9.6 Heteroskedasticity

```{r,}
plot(m1$model[, 2:1])
abline(m1)

```
```{r,}
# Breusch-Pagan test H0: the data are homoscedastic
m1 %>%
    bptest
## 
##  studentized Breusch-Pagan test
## 
## data:  .
## BP = 7.3844, df = 1, p-value = 0.006579
```

The p-value below 0.05 rejected H0, that is, the data are heteroscedastic.
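
A common remedy keeps the OLS estimates but reports heteroskedasticity-robust standard errors. A sketch, assuming the sandwich package is installed (it is not used elsewhere in these notes):

```{r,}
library(sandwich)
library(lmtest)
# White (HC1) robust standard errors for m1
coeftest(m1, vcov = vcovHC(m1, type = "HC1"))
```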

9.7 Log-log model

```{r,}
data("newbroiler", package = "PoEdata")
mod.a <- lm(q ~ p, newbroiler)
plot(mod.a$model[, 2:1])
abline(mod.a)

mod.a$residuals %>%
    scale %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.88678, p-value = 0.0001358
mod.a$residuals %>%
    qqPlot(main = "Very bad residuals")

## [1] 52 50
mod.a %>%
    reset
## 
##  RESET test
## 
## data:  .
## RESET = 41.974, df1 = 2, df2 = 48, p-value = 2.885e-11
# mod.a is inadequate
mod6 <- lm(log(q) ~ log(p), data = newbroiler)
plot(mod6$model[, 2:1])
abline(mod6)

mod6$residuals %>%
    scale %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.97287, p-value = 0.2787
mod6$residuals %>%
    scale %>%
    hist

mod6$residuals %>%
    qqPlot(main = "More adequate residuals")

## [1] 34 50
mod6 %>%
    reset
## 
##  RESET test
## 
## data:  .
## RESET = 9.0626, df1 = 2, df2 = 48, p-value = 0.0004581
nrow(newbroiler)
## [1] 52
mod6 %>%
    bptest
## 
##  studentized Breusch-Pagan test
## 
## data:  .
## BP = 2.5034, df = 1, p-value = 0.1136
mod6.a <- lm(log(q) ~ I(log(p)^3), data = newbroiler)
mod6.a$residuals %>%
    scale %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.94107, p-value = 0.01238
mod6.a$residuals %>%
    scale %>%
    hist

mod6.a$residuals %>%
    qqPlot()

## [1] 52 51
mod6.a %>%
    reset
## 
##  RESET test
## 
## data:  .
## RESET = 32.957, df1 = 2, df2 = 48, p-value = 9.816e-10
mod.a.a <- lm(q ~ p + I(p^2) + I(p^3), newbroiler)
# third-degree approximation
mod.a.a$residuals %>%
    scale %>%
    shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.94371, p-value = 0.01589
mod.a.a$residuals %>%
    scale %>%
    hist

mod.a.a %>%
    reset
## 
##  RESET test
## 
## data:  .
## RESET = 2.6918, df1 = 2, df2 = 46, p-value = 0.07843
```
```{r,}
summary(mod6)
## 
## Call:
## lm(formula = log(q) ~ log(p), data = newbroiler)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.228363 -0.080077 -0.007662  0.106041  0.218679 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.71694    0.02236   166.2   <2e-16 ***
## log(p)      -1.12136    0.04876   -23.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.118 on 50 degrees of freedom
## Multiple R-squared:  0.9136, Adjusted R-squared:  0.9119 
## F-statistic:   529 on 1 and 50 DF,  p-value: < 2.2e-16
# generalized R^2
cor(mod6$fitted.values, log(newbroiler$q))^2
## [1] 0.9136386
```
```{r,}
modelo.linear.newbroiler = lm(q ~ p, data = newbroiler)
modelo.quadratico.newbroiler = lm(q ~ I(p^2), data = newbroiler)
modelo.quadratico.newbroiler %>%
    summary
## 
## Call:
## lm(formula = q ~ I(p^2), data = newbroiler)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.128  -5.888  -3.190   6.574  15.670 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   42.415      1.702  24.915  < 2e-16 ***
## I(p^2)        -4.650      0.549  -8.471 3.14e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.757 on 50 degrees of freedom
## Multiple R-squared:  0.5893, Adjusted R-squared:  0.5811 
## F-statistic: 71.75 on 1 and 50 DF,  p-value: 3.138e-11
```

9.8 Overfitting

Overfitting is the phenomenon in which the model is excessively complex for the phenomenon it explains, making excellent predictions for the known sample points but failing to handle unseen points well.

In the following example, we have a set of points that can be adequately approximated by a straight line but that was overfitted by a fourth-degree polynomial and by a ninth-degree one.

```{r,fig.asp=1, fig.width=5}
# random seed
set.seed(112)
# 10 values of x
x = 1:10
# 10 values of y
y = 2 * x + 2 * rnorm(x)
# regression on a ninth-degree polynomial
lm(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) +
    I(x^8) + I(x^9)) -> m9
# regression on a fourth-degree polynomial
lm(y ~ x + I(x^2) + I(x^3) + I(x^4)) -> m4
# linear regression
lm(y ~ x) -> m1
# compute the polynomial curves
xx = seq(-0.3 + min(x), max(x) + 0.2, length.out = 100)
yy = predict(m9, data.frame(x = xx))
yy4 = predict(m4, data.frame(x = xx))
# plot the points respecting the maxima and minima of the
# polynomial curves
plot(x, y, ylim = c(min(c(0, y, yy, yy4)), max(c(y, yy, yy4)) *
    1.01), xlim = c(min(c(0, x, xx)), max(c(x, xx)) * 1.01),
    xlab = expression(x), ylab = expression(y), pch = 20, col = "#C00000")
# draw the polynomial curves
lines(xx, yy, col = "red")
lines(xx, yy4, col = "darkgray")
# draw a simple regression line
abline(m1, col = "blue")
grid()
legend("topleft", legend = paste("Degree", c(1, 4, 9)), fill = c("blue",
    "darkgray", "red"), cex = 0.7)

```

Notice that with the degree-9 polynomial the fit becomes very poor for points outside the sample, especially those beyond its limits.
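
A minimal sketch of this, using the three fits above: predicting at a point just beyond the sample (x = 11) shows how far the high-degree polynomials stray from the linear trend.

```{r,}
newx <- data.frame(x = 11)  # just outside the observed range 1:10
c(degree1 = predict(m1, newx), degree4 = predict(m4, newx),
    degree9 = predict(m9, newx))
```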

LECTURE 4

```{r,}
library(magrittr)
```

10 Chapter 5 The Multiple Regression Model

10.1 5.1 The General Model

Suppose our sample has \(m\) explanatory variables and \(n\) observed cases:

\[ \widehat{y} = b_0 + b_1x_1 + b_2x_2+\cdots+b_mx_m \]

10.2 5.2 Example: Big Andy’s Hamburger Sales

\[ sales=\beta_0+\beta_1price+\beta_2advert+e \]

\(H_1: \beta_1\) should be negative

\(H_2: \beta_2\) should be positive

Freund, J. E., & Simon, G. A. 2000. Estatística aplicada: Economia, administração e contabilidade (9th ed.). Porto Alegre: Bookman.

minhabiblioteca.ufpr.br

```{r,}
library(PoEdata)
data(andy)
head(andy)
 sales price advert
  73.2  5.69    1.3
  71.8  6.49    2.9
  62.4  5.63    0.8
  67.4  6.22    0.7
  89.3  5.02    1.5
  70.3  6.41    1.3
plot(andy$price, andy$sales)

plot(andy$advert, andy$sales, ylim = c(min(andy$sales), max(andy$sales)),
    pch = 20, col = "darkgray")

```
```{r,}
library(xtable)
mod1 <- lm(sales ~ price + advert, data = andy)
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.3
kable(summary(mod1) %>%
    xtable %>%
    data.frame)
              Estimate Std..Error   t.value  Pr...t..
(Intercept) 118.913610  6.3516375 18.721725 0.0000000
price        -7.907854  1.0959930 -7.215242 0.0000000
advert        1.862584  0.6831955  2.726283 0.0080382
summary(mod1)
## 
## Call:
## lm(formula = sales ~ price + advert, data = andy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.4825  -3.1434  -0.3456   2.8754  11.3049 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 118.9136     6.3516  18.722  < 2e-16 ***
## price        -7.9079     1.0960  -7.215 4.42e-10 ***
## advert        1.8626     0.6832   2.726  0.00804 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.886 on 72 degrees of freedom
## Multiple R-squared:  0.4483, Adjusted R-squared:  0.4329 
## F-statistic: 29.25 on 2 and 72 DF,  p-value: 5.041e-10
```
```{r,}
library(lmtest)
# Test 1: normality of the residuals; everything is OK if I
# do not reject the H0 of normality
shapiro.test(mod1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod1$residuals
## W = 0.98797, p-value = 0.7028
# Test 2: functional form (Ramsey RESET test); if H0 is not
# rejected, the functional form is adequate
reset(mod1)
## 
##  RESET test
## 
## data:  mod1
## RESET = 0.53369, df1 = 2, df2 = 70, p-value = 0.5888
# Test 3: heteroscedasticity; if H0 is not rejected, the
# residuals are homoscedastic
bptest(mod1)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod1
## BP = 2.5722, df = 2, p-value = 0.2763
```
```{r,}
library(effects)
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
effprice <- effect("price", mod1)
plot(effprice)

```
```{r,}
alleffandy <- allEffects(mod1)
plot(alleffandy)

```
```{r,}
mod2 <- lm(sales ~ price + advert + I(advert^2), data = andy)
summary(mod2)
## 
## Call:
## lm(formula = sales ~ price + advert + I(advert^2), data = andy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.2553  -3.1430  -0.0117   2.8513  11.8050 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 109.7190     6.7990  16.137  < 2e-16 ***
## price        -7.6400     1.0459  -7.304 3.24e-10 ***
## advert       12.1512     3.5562   3.417  0.00105 ** 
## I(advert^2)  -2.7680     0.9406  -2.943  0.00439 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.645 on 71 degrees of freedom
## Multiple R-squared:  0.5082, Adjusted R-squared:  0.4875 
## F-statistic: 24.46 on 3 and 71 DF,  p-value: 5.6e-11
```
```{r,}
library(lmtest)
# Test 1: normality of the residuals; everything is OK if I
# do not reject the H0 of normality
shapiro.test(mod2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod2$residuals
## W = 0.98775, p-value = 0.6894
# Test 2: functional form (Ramsey RESET test); if H0 is not
# rejected, the functional form is adequate
reset(mod2)
## 
##  RESET test
## 
## data:  mod2
## RESET = 2.7695, df1 = 2, df2 = 69, p-value = 0.06967
# Test 3: heteroscedasticity; if H0 is not rejected, the
# residuals are homoscedastic (note: this call re-tests mod1)
bptest(mod1)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod1
## BP = 2.5722, df = 2, p-value = 0.2763
```
```{r,}
alleffandy <- allEffects(mod2)
plot(alleffandy)

```
```{r,}
plot(andy$advert, andy$sales, ylim = c(min(andy$sales), max(andy$sales)),
    pch = 20, col = "darkgray")
adv = seq(min(andy$advert), max(andy$advert), length.out = 100)
y = predict(mod1, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "red", lwd = 2)
y = predict(mod2, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "blue", lwd = 2)

```
```{r,}
plot(mod1)

```
```{r,}
plot(mod2)

```

Covariance matrices

```{r,}
vcov(mod1)
            (Intercept)      price     advert
(Intercept)  40.3432990 -6.7950641 -0.7484206
price        -6.7950641  1.2012007 -0.0197422
advert       -0.7484206 -0.0197422  0.4667561
vcov(mod2)
            (Intercept)      price      advert I(advert^2)
(Intercept)   46.227019 -6.4261130 -11.6009601   2.9390263
price         -6.426113  1.0939881   0.3004062  -0.0856191
advert       -11.600960  0.3004062  12.6463020  -3.2887457
I(advert^2)    2.939026 -0.0856191  -3.2887457   0.8847736
cbind(andy$sales, andy$price, andy$advert, andy$advert^2) %>%
    var
 42.101106 -2.1042341 1.1984270 3.3387205
 -2.104234  0.2687718 0.0113681 0.0682647
  1.198427  0.0113681 0.6916865 2.5721319
  3.338720  0.0682647 2.5721319 9.8969231
```

Were we able to confirm our hypotheses?

Were we able to control for the alternative hypotheses?

\(R^2_{aj}(mod1)=0.4329316\) and \(R^2_{aj}(mod2)=0.4874564\)

```{r,}
# the adjusted R^2 values quoted above, extracted directly
summary(mod1)$adj.r.squared
summary(mod2)$adj.r.squared
```

10.3 t distribution or normal distribution?

For large samples the two are practically interchangeable: as the degrees of freedom grow, the critical t value converges to the normal critical value of 1.96, as the quantiles below show.

```{r,}
qnorm(1 - 0.05/2) %>%
    round(4)
## [1] 1.96
qt(1 - 0.05/2, 5000 - 2)
## [1] 1.960439
qt(1 - 0.05/2, 500 - 2)
## [1] 1.964739
qt(1 - 0.05/2, 200 - 2)
## [1] 1.972017
qt(1 - 0.05/2, 120 - 2)
## [1] 1.980272
qt(1 - 0.05/2, 30 - 2)
## [1] 2.048407
```

10.4 Second-degree polynomial model for Andy

```{r,}
mod2
## 
## Call:
## lm(formula = sales ~ price + advert + I(advert^2), data = andy)
## 
## Coefficients:
## (Intercept)        price       advert  I(advert^2)  
##     109.719       -7.640       12.151       -2.768
```

$$
\widehat{sales} = 109.719 - 7.640\; price + 12.151\; advert - 2.768\; advert^2
$$

What is the maximum amount to invest in advertising?

It is near the point where the parabola reaches its maximum.

Maximum point:

\[ advert = \dfrac{-b}{2a} = \dfrac{-12.151}{2(-2.768)} = \dfrac{12.151}{5.536} \approx 2.194906 \]

```{r,}
12.151/(2 * 2.768)
## [1] 2.194906
```
```{r,}
plot(andy$advert, andy$sales, ylim = c(min(andy$sales), max(andy$sales)),
    pch = 20, col = "darkgray")
adv = seq(min(andy$advert), max(andy$advert), length.out = 100)
y = predict(mod1, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "red", lwd = 2)
y = predict(mod2, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "blue", lwd = 2)
xmax = 12.151/(2 * 2.768)
ymax = predict(mod2, data.frame(price = mean(andy$price), advert = xmax))
abline(v = xmax, h = ymax, lty = 2, col = "blue")
points(xmax, ymax, col = "blue")
text(xmax, ymax, paste("(", xmax %>%
    round(2), ", ", ymax %>%
    round(2), ")", sep = ""), pos = 3, col = "blue")
grid()

mean(andy$price)
## [1] 5.6872
```

10.5 5.6 Interaction Terms in Linear Regression

```{r,}
data("pizza4", package = "PoEdata")
head(pizza4)
 pizza female hs college grad income age
   109      1  0       0    0   19.5  25
     0      1  0       0    0   39.0  45
     0      1  0       0    0   15.6  20
   108      1  0       0    0   26.0  28
   220      1  1       0    0   19.5  25
   189      1  1       0    0   39.0  35
?pizza4
```

According to the package documentation (source: http://principlesofeconometrics.com/poe4/poe4.htm), pizza4 is a data frame with 40 observations on 7 variables: pizza (annual pizza expenditure, $), female (=1 if female), hs (=1 if the highest degree received is a high school diploma), college (=1 if it is a college diploma), grad (=1 if it is a postgraduate degree), income (annual income in thousands of dollars), and age (age in years).
```{r,}
mod3 = lm(pizza ~ income, data = pizza4)
mod3 %>%
    summary
## 
## Call:
## lm(formula = pizza ~ income, data = pizza4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -260.17 -103.81  -49.86  122.59  337.12 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 128.9803    34.5913   3.729 0.000626 ***
## income        1.1213     0.4595   2.440 0.019461 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 146.8 on 38 degrees of freedom
## Multiple R-squared:  0.1355, Adjusted R-squared:  0.1127 
## F-statistic: 5.954 on 1 and 38 DF,  p-value: 0.01946
mod3 = lm(pizza ~ age, data = pizza4)
mod3 %>%
    summary
## 
## Call:
## lm(formula = pizza ~ age, data = pizza4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -235.90 -131.28  -18.39  107.80  393.60 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  301.728     84.206   3.583 0.000951 ***
## age           -3.291      2.408  -1.367 0.179671    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 154.2 on 38 degrees of freedom
## Multiple R-squared:  0.04687,    Adjusted R-squared:  0.02179 
## F-statistic: 1.869 on 1 and 38 DF,  p-value: 0.1797
mod3 = lm(pizza ~ age + income, data = pizza4)
mod3 %>%
    summary
## 
## Call:
## lm(formula = pizza ~ age + income, data = pizza4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -219.96 -116.36   27.85   88.02  287.04 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 342.8848    72.3434   4.740 3.14e-05 ***
## age          -7.5756     2.3170  -3.270 0.002333 ** 
## income        1.8325     0.4643   3.947 0.000341 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 131.1 on 37 degrees of freedom
## Multiple R-squared:  0.3293, Adjusted R-squared:  0.293 
## F-statistic: 9.081 on 2 and 37 DF,  p-value: 0.0006185
mod3 = lm(pizza ~ age + income + age:income, data = pizza4)
mod3 %>%
    summary
## 
## Call:
## lm(formula = pizza ~ age + income + age:income, data = pizza4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -200.86  -83.82   20.70   85.04  254.23 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 161.46543  120.66341   1.338   0.1892  
## age          -2.97742    3.35210  -0.888   0.3803  
## income        6.97991    2.82277   2.473   0.0183 *
## age:income   -0.12324    0.06672  -1.847   0.0730 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127 on 36 degrees of freedom
## Multiple R-squared:  0.3873, Adjusted R-squared:  0.3363 
## F-statistic: 7.586 on 3 and 36 DF,  p-value: 0.0004681
mod3 = lm(pizza ~ age + income + income:age, data = pizza4)
mod3 %>%
    summary
## 
## Call:
## lm(formula = pizza ~ age + income + income:age, data = pizza4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -200.86  -83.82   20.70   85.04  254.23 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 161.46543  120.66341   1.338   0.1892  
## age          -2.97742    3.35210  -0.888   0.3803  
## income        6.97991    2.82277   2.473   0.0183 *
## age:income   -0.12324    0.06672  -1.847   0.0730 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127 on 36 degrees of freedom
## Multiple R-squared:  0.3873, Adjusted R-squared:  0.3363 
## F-statistic: 7.586 on 3 and 36 DF,  p-value: 0.0004681
mod3 = lm(pizza ~ age * income, data = pizza4)
mod3 %>%
    summary
## 
## Call:
## lm(formula = pizza ~ age * income, data = pizza4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -200.86  -83.82   20.70   85.04  254.23 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 161.46543  120.66341   1.338   0.1892  
## age          -2.97742    3.35210  -0.888   0.3803  
## income        6.97991    2.82277   2.473   0.0183 *
## age:income   -0.12324    0.06672  -1.847   0.0730 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127 on 36 degrees of freedom
## Multiple R-squared:  0.3873, Adjusted R-squared:  0.3363 
## F-statistic: 7.586 on 3 and 36 DF,  p-value: 0.0004681
```
```{r,}
data(cps4_small)
head(cps4_small)
 wage educ exper hrswk married female metro midwest south west black asian
18.70   16    39    37       1      1     1       0     1    0     0     0
11.50   12    16    62       0      0     0       1     0    0     0     0
15.04   16    13    40       1      0     1       0     0    1     1     0
25.95   14    11    40       0      1     1       0     1    0     1     0
24.03   12    51    40       1      0     1       0     0    0     0     0
20.00   12    30    40       1      0     1       0     0    0     0     0
```
```{r,}
# remember the mhist function that was defined in the file
# 'Pequenos Experimentos.Rmd'? (take a look there) I created a
# file containing that function (mhist.R); to make it available,
# just load the file straight from GitHub
source("https://raw.githubusercontent.com/jrfrega/2025/refs/heads/main/mhist.R")
# wage has many upper outliers
mhist(cps4_small$wage)

boxplot(cps4_small$wage)

# the log transformation reduces the problem to a few lower
# outliers
mhist(cps4_small$wage %>%
    log)

boxplot(cps4_small$wage %>%
    log)

# Remember that any model must be supported by adequate theory;
# here we propose an interaction between educ and exper and a
# decreasing return to exper
mod4 = lm(log(wage) ~ educ * exper + I(exper^2), data = cps4_small)
summary(mod4)
## 
## Call:
## lm(formula = log(wage) ~ educ * exper + I(exper^2), data = cps4_small)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.28227 -0.32856 -0.02725  0.33751  1.47088 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.297e-01  2.267e-01   2.336  0.01969 *  
## educ         1.272e-01  1.472e-02   8.642  < 2e-16 ***
## exper        6.298e-02  9.536e-03   6.604 6.48e-11 ***
## I(exper^2)  -7.139e-04  8.804e-05  -8.109 1.49e-15 ***
## educ:exper  -1.322e-03  4.949e-04  -2.672  0.00766 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5057 on 995 degrees of freedom
## Multiple R-squared:  0.2445, Adjusted R-squared:  0.2415 
## F-statistic: 80.52 on 4 and 995 DF,  p-value: < 2.2e-16
with(cps4_small, plot(educ, wage))

with(cps4_small, plot(exper, wage))

```

11 Chapter 6 Further Inference in Multiple Regression

We need indicators more robust than \(R^2\) to assess the quality of a regression.

Nowadays, information criteria are used:

AIC (Akaike Information Criterion)

BIC (Bayesian Information Criterion)
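
For a model with \(k\) estimated parameters (R counts the error variance as one of them), maximized likelihood \(L\), and \(n\) observations, the usual definitions are:

$$
AIC = 2k - 2\ln L \qquad\qquad BIC = k\ln(n) - 2\ln L
$$

Since \(\ln(n) > 2\) for \(n \ge 8\), the BIC penalizes extra parameters more heavily than the AIC.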

```{r,}
library(broom)
mod1
## 
## Call:
## lm(formula = sales ~ price + advert, data = andy)
## 
## Coefficients:
## (Intercept)        price       advert  
##     118.914       -7.908        1.863
mod2
## 
## Call:
## lm(formula = sales ~ price + advert + I(advert^2), data = andy)
## 
## Coefficients:
## (Intercept)        price       advert  I(advert^2)  
##     109.719       -7.640       12.151       -2.768
rbind(glance(mod1), glance(mod2))
 r.squared adj.r.squared    sigma statistic p.value df    logLik     AIC      BIC deviance df.residual nobs
 0.4482578     0.4329316 4.886124  29.24786       0  2 -223.8695 455.739 465.0090 1718.943          72   75
 0.5082352     0.4874564 4.645283  24.45932       0  3 -219.5540 449.108 460.6954 1532.084          71   75
```

The best model is the one with the lowest value for a given information criterion.

Based on the AIC, model 1 (mod1) is worse than model 2 (mod2) (455.739 > 449.108).

Information criteria balance model parsimony against goodness of fit.
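
As a quick check of the definition given above, the AIC of mod1 can be recomputed by hand (a sketch; k counts the three coefficients plus the error variance):

```{r,}
k <- length(coef(mod1)) + 1           # 3 coefficients + error variance
2 * k - 2 * as.numeric(logLik(mod1))  # matches AIC(mod1)
AIC(mod1)
```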

```{r,}
mod1$call
## lm(formula = sales ~ price + advert, data = andy)
mod2.1 = lm(formula = sales ~ price + advert + I(advert^2) +
    I(price^2), data = andy)
rbind(glance(mod1), glance(mod2), glance(mod2.1))
 r.squared adj.r.squared    sigma statistic p.value df    logLik      AIC      BIC deviance df.residual nobs
 0.4482578     0.4329316 4.886124  29.24786       0  2 -223.8695 455.7390 465.0090 1718.943          72   75
 0.5082352     0.4874564 4.645283  24.45932       0  3 -219.5540 449.1080 460.6954 1532.084          71   75
 0.5279827     0.5010102 4.583451  19.57491       0  4 -218.0171 448.0341 461.9391 1470.561          70   75
mod2.2 = lm(formula = sales ~ price + advert + I(advert^2) +
    I(advert^3) + I(price^2), data = andy)
rbind(glance(mod1), glance(mod2), glance(mod2.1), glance(mod2.2)) %>%
    as.data.frame -> df
row.names(df) = c("mod1 (linear)", "mod2 (quadratic in advert)",
    "mod2.1 (quadratic in everything)", "mod2.2 (quadratic in price, cubic in advert)")
df[, c("adj.r.squared", "AIC", "BIC")]
                                              adj.r.squared      AIC      BIC
mod1 (linear)                                     0.4329316 455.7390 465.0090
mod2 (quadratic in advert)                        0.4874564 449.1080 460.6954
mod2.1 (quadratic in everything)                  0.5010102 448.0341 461.9391
mod2.2 (quadratic in price, cubic in advert)      0.4938355 450.0257 466.2481
summary(mod2.1)
## 
## Call:
## lm(formula = sales ~ price + advert + I(advert^2) + I(price^2), 
##     data = andy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.1771  -2.9378   0.0949   2.8897  11.6856 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 248.8390    81.5712   3.051 0.003225 ** 
## price       -57.0629    28.8988  -1.975 0.052262 .  
## advert       13.1624     3.5582   3.699 0.000427 ***
## I(advert^2)  -3.0896     0.9469  -3.263 0.001708 ** 
## I(price^2)    4.3364     2.5340   1.711 0.091454 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.583 on 70 degrees of freedom
## Multiple R-squared:  0.528,  Adjusted R-squared:  0.501 
## F-statistic: 19.57 on 4 and 70 DF,  p-value: 7.553e-11
```

By the Akaike criterion (AIC), the best model is mod2.1 (quadratic in everything).

By the Schwarz criterion (BIC), the best model is mod2 (quadratic in advert).